scom-cam colab

if (T) {
# load packages
library("tidyverse")
require(gridExtra)
library(grid)
library(viridis)
#library("quarto")
library("irr")
# clear workspace
rm(list=ls())
# load functions
source("../src/functions.R")
}
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ purrr::%||%()   masks base::%||%()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Loading required package: gridExtra


Attaching package: 'gridExtra'


The following object is masked from 'package:dplyr':

    combine


Loading required package: viridisLite

Loading required package: lpSolve

250302: pretty diagrams

clean dataset

# load dataset
#fn = "../csv/haidi-data-231012.csv"
fn = "../csv/haidi-data-231107.csv"
data = read.table(fn, sep='\t', header=T, strip.white=T, stringsAsFactors=F) |> as_tibble()

frequency of categories

#if (!requireNamespace("colorblindr", quietly = TRUE)) install.packages("colorblindr", lib="~/lib/r-cran")
#library(ggplot2)
#library(gridExtra)
#library(dplyr)  # For string manipulation and recode
#library(stringr)  # For str_trunc
#library(colorblindr)  # For extended Okabe-Ito palette
library(RColorBrewer)  # For ColorBrewer palettes

# get graphs
for (i in c(4:9,12:15,18:21,22:32,34)) {
pd = desc_get_okabe(data, get(paste0("dt",sprintf("%02d", i))), i, var_pl=T)

# get table
print(pd[2])
#
write.table(pd[2], paste0("../tmp/haidi-table-v", sprintf("%02d", i), ".csv"), sep="\t", quot=T, row.names=F)
}
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.
[[1]]
# A tibble: 10 × 9
# Groups:   v35_country [3]
   v35_country v04_article.type count   prop prop_lab count_lab V1            V2
   <chr>                  <int> <int>  <dbl>    <dbl>     <dbl> <chr>      <int>
 1 dk                         1    26 0.531    0.735       36   Article t…     1
 2 dk                         2     7 0.143    0.398       19.5 Article t…     2
 3 dk                         3     7 0.143    0.255       12.5 Article t…     3
 4 dk                         4     9 0.184    0.0918       4.5 Article t…     4
 5 fi                         1    59 0.776    0.612       46.5 Article t…     1
 6 fi                         2    14 0.184    0.132       10   Article t…     2
 7 fi                         3     3 0.0395   0.0197       1.5 Article t…     3
 8 se                         1    14 0.233    0.883       53   Article t…     1
 9 se                         2    34 0.567    0.483       29   Article t…     2
10 se                         3    12 0.2      0.1          6   Article t…     3
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 9 × 9
# Groups:   v35_country [3]
  v35_country v05_article.size count   prop prop_lab count_lab V1       V2 V3   
  <chr>                  <int> <int>  <dbl>    <dbl>     <dbl> <chr> <int> <chr>
1 dk                         1     7 0.143    0.929       45.5 Arti…     1 Small
2 dk                         2    19 0.388    0.663       32.5 Arti…     2 Medi…
3 dk                         3    23 0.469    0.235       11.5 Arti…     3 Large
4 fi                         1    47 0.618    0.691       52.5 Arti…     1 Small
5 fi                         2    24 0.316    0.224       17   Arti…     2 Medi…
6 fi                         3     5 0.0658   0.0329       2.5 Arti…     3 Large
7 se                         1     9 0.15     0.925       55.5 Arti…     1 Small
8 se                         2    10 0.167    0.767       46   Arti…     2 Medi…
9 se                         3    41 0.683    0.342       20.5 Arti…     3 Large
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 9 × 9
# Groups:   v35_country [3]
  v35_country v06_illustrated count   prop prop_lab count_lab V1        V2 V3   
  <chr>                 <int> <int>  <dbl>    <dbl>     <dbl> <chr>  <int> <chr>
1 dk                        0    14 0.286    0.857       42   Illus…     0 No   
2 dk                        1    23 0.469    0.480       23.5 Illus…     1 Yes,…
3 dk                        2    12 0.245    0.122        6   Illus…     2 Yes …
4 fi                        0    49 0.645    0.678       51.5 Illus…     0 No   
5 fi                        1    25 0.329    0.191       14.5 Illus…     1 Yes,…
6 fi                        2     2 0.0263   0.0132       1   Illus…     2 Yes …
7 se                        0     3 0.0508   0.975       57.5 Illus…     0 No   
8 se                        1    52 0.881    0.508       30   Illus…     1 Yes,…
9 se                        2     4 0.0678   0.0339       2   Illus…     2 Yes …
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 11 × 9
# Groups:   v35_country [3]
   v35_country v07_type.of.illustration count   prop prop_lab count_lab V1      
   <chr>                          <int> <int>  <dbl>    <dbl>     <dbl> <chr>   
 1 dk                                 1    15 0.429   0.786        27.5 Type of…
 2 dk                                 2     3 0.0857  0.529        18.5 Type of…
 3 dk                                 3     9 0.257   0.357        12.5 Type of…
 4 dk                                 4     6 0.171   0.143         5   Type of…
 5 dk                                 5     2 0.0571  0.0286        1   Type of…
 6 fi                                 1    14 0.538   0.731        19   Type of…
 7 fi                                 2    12 0.462   0.231         6   Type of…
 8 se                                 1    38 0.679   0.661        37   Type of…
 9 se                                 2    15 0.268   0.187        10.5 Type of…
10 se                                 4     2 0.0357  0.0357        2   Type of…
11 se                                 5     1 0.0179  0.00893       0.5 Type of…
# ℹ 2 more variables: V2 <int>, V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v08_target.group.in.…¹ count   prop prop_lab count_lab V1       V2
  <chr>                        <int> <int>  <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                               0    27 0.931    0.534       15.5 Targ…     0
2 dk                               1     2 0.0690   0.0345       1   Targ…     1
3 fi                               0     4 0.267    0.867       13   Targ…     0
4 fi                               1    11 0.733    0.367        5.5 Targ…     1
5 se                               0    39 0.696    0.652       36.5 Targ…     0
6 se                               1    17 0.304    0.152        8.5 Targ…     1
# ℹ abbreviated name: ¹​v08_target.group.in.illustration
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v09_agency.of.target…¹ count   prop prop_lab count_lab V1       V2
  <chr>                        <int> <int>  <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                               0     1 0.5       0.75        1.5 Agen…     0
2 dk                               1     1 0.5       0.25        0.5 Agen…     1
3 fi                               0     1 0.0909    0.955      10.5 Agen…     0
4 fi                               1    10 0.909     0.455       5   Agen…     1
5 se                               0     7 0.412     0.794      13.5 Agen…     0
6 se                               1    10 0.588     0.294       5   Agen…     1
# ℹ abbreviated name: ¹​v09_agency.of.target.group.in.illustration
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v12_heading.content.…¹ count   prop prop_lab count_lab V1       V2
  <chr>                        <int> <int>  <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                               0    42 0.857    0.571       28   Head…     0
2 dk                               1     7 0.143    0.0714       3.5 Head…     1
3 fi                               0    67 0.882    0.559       42.5 Head…     0
4 fi                               1     9 0.118    0.0592       4.5 Head…     1
5 se                               0    56 0.933    0.533       32   Head…     0
6 se                               1     4 0.0667   0.0333       2   Head…     1
# ℹ abbreviated name: ¹​v12_heading.content.health
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v13_heading.content.…¹ count   prop prop_lab count_lab V1       V2
  <chr>                        <int> <int>  <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                               0    46 0.939    0.531       26   Head…     0
2 dk                               1     3 0.0612   0.0306       1.5 Head…     1
3 fi                               0    57 0.75     0.625       47.5 Head…     0
4 fi                               1    19 0.25     0.125        9.5 Head…     1
5 se                               0    44 0.733    0.633       38   Head…     0
6 se                               1    16 0.267    0.133        8   Head…     1
# ℹ abbreviated name: ¹​v13_heading.content.old
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v14_heading.content.d…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    34 0.694    0.653      32   Head…     0
2 dk                                1    15 0.306    0.153       7.5 Head…     1
3 fi                                0    48 0.632    0.684      52   Head…     0
4 fi                                1    28 0.368    0.184      14   Head…     1
5 se                                0    44 0.733    0.633      38   Head…     0
6 se                                1    16 0.267    0.133       8   Head…     1
# ℹ abbreviated name: ¹​v14_heading.content.digital.tekn
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 5 × 9
# Groups:   v35_country [3]
  v35_country v15_heading.content.…¹ count   prop prop_lab count_lab V1       V2
  <chr>                        <int> <int>  <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                               0    49 1       0.5          24.5 Head…     0
2 fi                               0    75 0.987   0.507        38.5 Head…     0
3 fi                               1     1 0.0132  0.00658       0.5 Head…     1
4 se                               0    55 0.917   0.542        32.5 Head…     0
5 se                               1     5 0.0833  0.0417        2.5 Head…     1
# ℹ abbreviated name: ¹​v15_heading.content.ill
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 28 × 9
# Groups:   v35_country [3]
   v35_country v18_article.content.domin…¹ count   prop prop_lab count_lab V1   
   <chr>                             <int> <int>  <dbl>    <dbl>     <dbl> <chr>
 1 dk                                    1     6 0.122     0.939      46   Arti…
 2 dk                                    2     1 0.0204    0.867      42.5 Arti…
 3 dk                                    3    14 0.286     0.714      35   Arti…
 4 dk                                    4     1 0.0204    0.561      27.5 Arti…
 5 dk                                    5     1 0.0204    0.541      26.5 Arti…
 6 dk                                    6     1 0.0204    0.520      25.5 Arti…
 7 dk                                    8     3 0.0612    0.480      23.5 Arti…
 8 dk                                    9     1 0.0204    0.439      21.5 Arti…
 9 dk                                   10     8 0.163     0.347      17   Arti…
10 dk                                   12    13 0.265     0.133       6.5 Arti…
# ℹ 18 more rows
# ℹ abbreviated name: ¹​v18_article.content.dominant.theme
# ℹ 2 more variables: V2 <int>, V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 20 × 9
# Groups:   v35_country [3]
   v35_country v19_subject.positioning count   prop prop_lab count_lab V1       
   <chr>                         <int> <int>  <dbl>    <dbl>     <dbl> <chr>    
 1 dk                                1    24 0.774    0.613       19   Subject …
 2 dk                                3     3 0.0968   0.177        5.5 Subject …
 3 dk                                6     1 0.0323   0.113        3.5 Subject …
 4 dk                                9     3 0.0968   0.0484       1.5 Subject …
 5 fi                                1    59 0.787    0.607       45.5 Subject …
 6 fi                                2     2 0.0267   0.2         15   Subject …
 7 fi                                3     2 0.0267   0.173       13   Subject …
 8 fi                                4     2 0.0267   0.147       11   Subject …
 9 fi                                8     2 0.0267   0.12         9   Subject …
10 fi                                9     5 0.0667   0.0733       5.5 Subject …
11 fi                               11     3 0.04     0.02         1.5 Subject …
12 se                                1    40 0.667    0.667       40   Subject …
13 se                                2     2 0.0333   0.317       19   Subject …
14 se                                3     5 0.0833   0.258       15.5 Subject …
15 se                                4     1 0.0167   0.208       12.5 Subject …
16 se                                5     4 0.0667   0.167       10   Subject …
17 se                                6     2 0.0333   0.117        7   Subject …
18 se                                7     2 0.0333   0.0833       5   Subject …
19 se                                8     2 0.0333   0.05         3   Subject …
20 se                                9     2 0.0333   0.0167       1   Subject …
# ℹ 2 more variables: V2 <int>, V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 5 × 9
# Groups:   v35_country [3]
  v35_country v20_agency.of.target.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    49 1       0.5         24.5 Agen…     0
2 fi                                0    62 0.816   0.592       45   Agen…     0
3 fi                                1    14 0.184   0.0921       7   Agen…     1
4 se                                0    53 0.883   0.558       33.5 Agen…     0
5 se                                1     7 0.117   0.0583       3.5 Agen…     1
# ℹ abbreviated name: ¹​v20_agency.of.target.group.in.article.voice
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v21_gender.visible.m…¹ count   prop prop_lab count_lab V1       V2
  <chr>                        <int> <int>  <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                               0    48 0.980    0.510       25   Gend…     0
2 dk                               1     1 0.0204   0.0102       0.5 Gend…     1
3 fi                               0    62 0.816    0.592       45   Gend…     0
4 fi                               1    14 0.184    0.0921       7   Gend…     1
5 se                               0    47 0.783    0.608       36.5 Gend…     0
6 se                               1    13 0.217    0.108        6.5 Gend…     1
# ℹ abbreviated name: ¹​v21_gender.visible.mentioned
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v22_actors.mentioned.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    24 0.490    0.755      37   Acto…     0
2 dk                                1    25 0.510    0.255      12.5 Acto…     1
3 fi                                0    47 0.618    0.691      52.5 Acto…     0
4 fi                                1    29 0.382    0.191      14.5 Acto…     1
5 se                                0    44 0.733    0.633      38   Acto…     0
6 se                                1    16 0.267    0.133       8   Acto…     1
# ℹ abbreviated name: ¹​v22_actors.mentioned.given.voice.state.gov
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v23_actors.mentioned.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    37 0.755    0.622      30.5 Acto…     0
2 dk                                1    12 0.245    0.122       6   Acto…     1
3 fi                                0    42 0.553    0.724      55   Acto…     0
4 fi                                1    34 0.447    0.224      17   Acto…     1
5 se                                0    40 0.667    0.667      40   Acto…     0
6 se                                1    20 0.333    0.167      10   Acto…     1
# ℹ abbreviated name: ¹​v23_actors.mentioned.given.voice.region
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v24_actors.mentioned.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    24 0.490    0.755      37   Acto…     0
2 dk                                1    25 0.510    0.255      12.5 Acto…     1
3 fi                                0    58 0.763    0.618      47   Acto…     0
4 fi                                1    18 0.237    0.118       9   Acto…     1
5 se                                0    43 0.717    0.642      38.5 Acto…     0
6 se                                1    17 0.283    0.142       8.5 Acto…     1
# ℹ abbreviated name: ¹​v24_actors.mentioned.given.voice.municipality
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v25_actors.mentioned.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    29 0.592    0.704      34.5 Acto…     0
2 dk                                1    20 0.408    0.204      10   Acto…     1
3 fi                                0    47 0.618    0.691      52.5 Acto…     0
4 fi                                1    29 0.382    0.191      14.5 Acto…     1
5 se                                0    45 0.75     0.625      37.5 Acto…     0
6 se                                1    15 0.25     0.125       7.5 Acto…     1
# ℹ abbreviated name: ¹​v25_actors.mentioned.given.voice.agency
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v26_actors.mentioned.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    26 0.531   0.735       36   Acto…     0
2 dk                                1    23 0.469   0.235       11.5 Acto…     1
3 fi                                0    62 0.816   0.592       45   Acto…     0
4 fi                                1    14 0.184   0.0921       7   Acto…     1
5 se                                0    45 0.75    0.625       37.5 Acto…     0
6 se                                1    15 0.25    0.125        7.5 Acto…     1
# ℹ abbreviated name: ¹​v26_actors.mentioned.given.voice.politician.party
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v27_actors.mentioned.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    23 0.469    0.765      37.5 Acto…     0
2 dk                                1    26 0.531    0.265      13   Acto…     1
3 fi                                0    43 0.566    0.717      54.5 Acto…     0
4 fi                                1    33 0.434    0.217      16.5 Acto…     1
5 se                                0    43 0.717    0.642      38.5 Acto…     0
6 se                                1    17 0.283    0.142       8.5 Acto…     1
# ℹ abbreviated name:
#   ¹​v27_actors.mentioned.given.voice.physician.nurse.health.staff
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v28_actors.mentioned.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    27 0.551    0.724      35.5 Acto…     0
2 dk                                1    22 0.449    0.224      11   Acto…     1
3 fi                                0    35 0.461    0.770      58.5 Acto…     0
4 fi                                1    41 0.539    0.270      20.5 Acto…     1
5 se                                0    36 0.6      0.7        42   Acto…     0
6 se                                1    24 0.4      0.2        12   Acto…     1
# ℹ abbreviated name:
#   ¹​v28_actors.mentioned.given.voice.organ.health.care.service
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v29_actors.mentioned.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    29 0.592    0.704      34.5 Acto…     0
2 dk                                1    20 0.408    0.204      10   Acto…     1
3 fi                                0    51 0.671    0.664      50.5 Acto…     0
4 fi                                1    25 0.329    0.164      12.5 Acto…     1
5 se                                0    43 0.717    0.642      38.5 Acto…     0
6 se                                1    17 0.283    0.142       8.5 Acto…     1
# ℹ abbreviated name: ¹​v29_actors.mentioned.given.voice.scientist
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v30_actors.mentioned.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    39 0.796    0.602      29.5 Acto…     0
2 dk                                1    10 0.204    0.102       5   Acto…     1
3 fi                                0    59 0.776    0.612      46.5 Acto…     0
4 fi                                1    17 0.224    0.112       8.5 Acto…     1
5 se                                0    40 0.667    0.667      40   Acto…     0
6 se                                1    20 0.333    0.167      10   Acto…     1
# ℹ abbreviated name: ¹​v30_actors.mentioned.given.voice.ngo
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v31_actors.mentioned.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    31 0.633    0.684      33.5 Acto…     0
2 dk                                1    18 0.367    0.184       9   Acto…     1
3 fi                                0    58 0.763    0.618      47   Acto…     0
4 fi                                1    18 0.237    0.118       9   Acto…     1
5 se                                0    44 0.733    0.633      38   Acto…     0
6 se                                1    16 0.267    0.133       8   Acto…     1
# ℹ abbreviated name: ¹​v31_actors.mentioned.given.voice.industry
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v32_actors.mentioned.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    13 0.265    0.867      42.5 Acto…     0
2 dk                                1    36 0.735    0.367      18   Acto…     1
3 fi                                0    38 0.5      0.75       57   Acto…     0
4 fi                                1    38 0.5      0.25       19   Acto…     1
5 se                                0    39 0.65     0.675      40.5 Acto…     0
6 se                                1    21 0.35     0.175      10.5 Acto…     1
# ℹ abbreviated name:
#   ¹​v32_actors.mentioned.given.voice.other.citizen.family.relatives
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 8 × 9
# Groups:   v35_country [3]
  v35_country v34_tonality.of.arti…¹ count   prop prop_lab count_lab V1       V2
  <chr>                        <int> <int>  <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                               1    22 0.449    0.776       38   Tona…     1
2 dk                               2    24 0.490    0.306       15   Tona…     2
3 dk                               3     3 0.0612   0.0306       1.5 Tona…     3
4 fi                               1    26 0.342    0.829       63   Tona…     1
5 fi                               2    33 0.434    0.441       33.5 Tona…     2
6 fi                               3    17 0.224    0.112        8.5 Tona…     3
7 se                               1    20 0.333    0.833       50   Tona…     1
8 se                               2    40 0.667    0.333       20   Tona…     2
# ℹ abbreviated name: ¹​v34_tonality.of.article
# ℹ 1 more variable: V3 <chr>
cat ../tmp/haidi-table-* > ../tmp/haidi-tables.csv

250210: chi square tests

v04_article.type, count values by country and article type

Below is a summary interpretation of the chi-square test results:

  • Test Statistic and Significance:
    The chi-square statistic is 72.89 with 6 degrees of freedom. The associated p-value is extremely small (approximately 1.04 × 10⁻¹³), which is far below any conventional significance level (e.g., 0.05). This provides strong evidence to reject the null hypothesis of independence.

  • Interpretation:
    The results indicate that there is a statistically significant association between country (dk, fi, se) and article type (1, 2, 3, 4). In other words, the distribution of article types is not uniform across the three countries.

  • Observed vs. Expected Frequencies:
    The contingency table shows notable differences between observed counts and expected counts under the assumption of independence. For example:

    • In Denmark (dk), article type 2 was observed only 7 times while about 14.57 were expected, and article type 4 was observed 9 times compared to an expected 2.38.
    • In Finland (fi), article type 1 appears more frequently (observed 59 vs. expected 40.67), and no articles of type 4 were observed even though about 3.70 were expected.
    • In Sweden (se), article type 2 is observed 34 times, which is much higher than the expected count of about 17.84.
  • Conclusion:
    The chi-square test clearly demonstrates that the type of article published is associated with the country. This suggests that factors related to the country may influence the distribution of article types.

In summary, the analysis shows that the differences in article type counts across Denmark, Finland, and Sweden are unlikely to be due to chance, pointing to a meaningful relationship between country and article type.

v05_article.size, count values by country and article size

The chi-square test yielded a statistic of approximately 70.19 with 4 degrees of freedom and a p-value of about 2.07×10⁻¹⁴. This extremely small p-value indicates that there is a statistically significant association between country (dk, fi, se) and article size (small, medium, large). In other words, the observed distribution of article sizes across the three countries is very unlikely to have occurred by chance if the two variables were independent.

Key Observations:

  • Denmark (dk):
    • Observed counts: 7 (small), 19 (medium), 23 (large)
    • Expected counts under independence: approximately 16.69 (small), 14.04 (medium), 18.28 (large)
      The observed count for small articles is lower than expected, while medium and large are higher than expected.
  • Finland (fi):
    • Observed counts: 47 (small), 24 (medium), 5 (large)
    • Expected counts: approximately 25.88 (small), 21.77 (medium), 28.35 (large)
      Here, there is an excess of small articles and a deficit of large articles compared to the expected frequencies.
  • Sweden (se):
    • Observed counts: 9 (small), 10 (medium), 41 (large)
    • Expected counts: approximately 20.43 (small), 17.19 (medium), 22.38 (large)
      In Sweden, the observed count for large articles is much higher than expected, while small and medium counts are lower.

Conclusion:

The significant chi-square test result indicates that the distribution of article sizes is not independent of country. The differences between the observed and expected frequencies suggest that each country has a distinct pattern in article size distribution. For example, Finland appears to favor small articles, whereas Sweden shows a strong tendency toward large articles. These differences are statistically significant and suggest that factors related to each country may be influencing the article size choices.

v06_illustrated

The chi-square test produced a statistic of approximately 68.33 with 4 degrees of freedom and an extremely small p-value (≈ 5.12×10⁻¹⁴). This means we reject the null hypothesis of independence and conclude that there is a statistically significant association between country and illustration status.

Key Observations:

  • Overall Significance:
    The p-value is far below any conventional significance threshold (e.g., 0.05), indicating that the differences in illustration status across countries are highly unlikely to be due to chance.

  • Country-Specific Patterns:

    • Denmark (dk):
      • Observed counts: 14 (no illustration, 0), 23 (one illustration, 1), 12 (two or more illustrations, 2).
      • Expected counts: approximately 17.58, 26.63, and 4.79 respectively.
      • Interpretation: Denmark shows a notable excess in the highest illustration category (v06_illustrated = 2) compared to expectations.
    • Finland (fi):
      • Observed counts: 49 (0), 25 (1), 2 (2).
      • Expected counts: approximately 27.26, 41.30, and 7.43 respectively.
      • Interpretation: Finland has many more articles with no illustrations (v06_illustrated = 0) and far fewer with one or multiple illustrations than expected.
    • Sweden (se):
      • Observed counts: 3 (0), 52 (1), 4 (2).
      • Expected counts: approximately 21.16, 32.07, and 5.77 respectively.
      • Interpretation: Sweden has far fewer non-illustrated articles than expected and a higher count for the one illustration category.

Conclusion:

There is clear evidence that the distribution of illustration status (v06_illustrated) differs significantly by country. Denmark tends to publish a higher proportion of articles with two or more illustrations than expected, Finland tends to have a high proportion of articles with no illustrations, and Sweden shows an overrepresentation in the one illustration category. These differences suggest that country-specific factors or editorial practices may be influencing the use of illustrations.

v07_type.of.illu

The chi-square test produced a statistic of approximately 42.02 with 8 degrees of freedom and an extremely small p-value (≈ 1.34×10⁻⁶). This provides very strong evidence against the null hypothesis of independence, indicating that the distribution of illustration types (v07_type_of_illustration) is significantly different across the three countries.

Key Points:

  • Overall Significance:
    With a p-value far below conventional significance thresholds (e.g., 0.05), we conclude that the type of illustration used is not independent of the country. In other words, the distribution of illustration types varies significantly by country.

  • Observed vs. Expected Frequencies:

    • Denmark (dk):
      • Type 1: Observed 15 vs. Expected ≈ 20.04
      • Type 3: Observed 9 vs. Expected ≈ 2.69
      • Type 4: Observed 6 vs. Expected ≈ 2.39
      • Type 5: Observed 2 vs. Expected ≈ 0.90
        These differences suggest that Denmark has fewer type 1 illustrations and considerably more type 3 and type 4 illustrations than would be expected if there were no association.
    • Finland (fi):
      • Type 1: Observed 14 vs. Expected ≈ 14.89
      • Type 2: Observed 12 vs. Expected ≈ 6.67
      • No articles were observed for types 3, 4, or 5, whereas some counts were expected (e.g., type 3 expected ≈ 2.00).
        This indicates that Finland has an overrepresentation of type 2 illustrations and a lack of articles in the higher illustration categories.
    • Sweden (se):
      • Type 1: Observed 38 vs. Expected ≈ 32.07
      • Type 2: Observed 15 vs. Expected ≈ 14.36
      • Type 3: Observed 0 vs. Expected ≈ 4.31
      • Type 4: Observed 2 vs. Expected ≈ 3.83
      • Type 5: Observed 1 vs. Expected ≈ 1.44
        For Sweden, the count for type 1 is slightly higher than expected, while there are fewer type 3 illustrations than expected.

Conclusion:

The significant chi-square statistic indicates that the type of illustration is associated with the country. Each country shows a distinct pattern: - Denmark appears to favor higher counts in types 3 and 4. - Finland shows a strong emphasis on type 2 illustrations with a near absence of higher illustration categories. - Sweden has a relatively high count of type 1 illustrations and lower counts for type 3.

These findings suggest that country-specific factors or editorial policies may influence the choice or prevalence of illustration types in articles.

v08_target.group

The chi-square test produced a statistic of about 20.79 with 2 degrees of freedom and a very small p-value (≈ 3.06×10⁻⁵). This p-value is far below any common significance threshold (e.g., 0.05), meaning we reject the null hypothesis that the country and target group in illustration are independent.

Key Points:

  • Statistical Significance:
    The small p-value indicates that the observed differences in target group distribution across the three countries (dk, fi, se) are unlikely to have occurred by chance.

  • Observed vs. Expected Frequencies:

    • For Denmark (dk):
      • Observed: 27 for group 0 and 2 for group 1
      • Expected: Approximately 20.3 for group 0 and 8.7 for group 1
        This suggests Denmark has more group 0 and fewer group 1 articles than expected under independence.
    • For Finland (fi):
      • Observed: 4 for group 0 and 11 for group 1
      • Expected: About 10.5 for group 0 and 4.5 for group 1
        This indicates Finland has an overrepresentation of group 1 and an underrepresentation of group 0.
    • For Sweden (se):
      • Observed: 39 for group 0 and 17 for group 1
      • Expected: Roughly 39.2 for group 0 and 16.8 for group 1
        The observed counts for Sweden are very close to what would be expected.
  • Overall Interpretation:
    The significant chi-square test result tells us that the distribution of the target group in illustration (group 0 vs. group 1) differs by country. Denmark and Finland show marked deviations from the expected frequencies under the assumption of independence, while Sweden’s distribution is close to the expected values. This indicates that country-specific factors may be influencing the categorization into target groups.

In summary, there is strong evidence of an association between the country and the target group in illustration.

231110: clean dataset

# load dataset
#fn = "../csv/haidi-data-231012.csv"
fn = "../csv/haidi-data-231107.csv"
data = read.table(fn, sep='\t', header=T, strip.white=T, stringsAsFactors=F) |> as_tibble()

frequency, news source, year

if (T) {
#
result_df = data |> 
left_join(dt03, by=join_by(v03_news.source==V2)) |> 
mutate(cn=paste(v35_country, V3), cc=as.factor(v35_country), v36_date_year=str_extract(v02_date, "\\d{4}")) |>
group_by(cn, v36_date_year) |> 
summarize(count=n())
#
cat(simplermarkdown::md_table(result_df))
}
`summarise()` has grouped output by 'cn'. You can override using the `.groups`
argument.
cn v36_date_year count
dk EB 2017 1
dk FS 2017 1
dk FS 2018 2
dk FS 2019 3
dk FS 2021 3
dk FS 2022 1
dk JV 2017 2
dk JV 2018 1
dk JV 2019 1
dk JV 2021 1
dk JV 2022 2
dk NJ 2018 1
dk NJ 2020 1
dk NJ 2022 1
dk PO 2017 3
dk PO 2018 2
dk PO 2019 9
dk PO 2020 4
dk PO 2021 5
dk PO 2022 5
fi HS 2017 2
fi HS 2018 1
fi HS 2019 7
fi HS 2020 3
fi HS 2021 2
fi IS 2017 1
fi IS 2020 2
fi IS 2022 1
fi K 2017 1
fi K 2019 1
fi K 2020 4
fi K 2021 3
fi K 2022 5
fi LK 2017 3
fi LK 2020 2
fi LK 2021 5
fi LK 2022 5
fi TS 2017 5
fi TS 2018 1
fi TS 2019 6
fi TS 2020 5
fi TS 2021 3
fi TS 2022 8
se AB 2017 2
se AB 2018 2
se AB 2019 1
se AB 2020 2
se AB 2021 1
se AB 2022 1
se DN 2017 1
se DN 2018 3
se DN 2019 3
se DN 2020 1
se DN 2021 4
se DN 2022 1
se GP 2017 4
se GP 2018 3
se GP 2019 2
se GP 2020 1
se GP 2021 1
se GP 2022 1
se SDS 2018 1
se SDS 2019 2
se SDS 2020 3
se SDS 2021 3
se SDS 2022 1
se VK 2017 2
se VK 2018 2
se VK 2019 7
se VK 2020 2
se VK 2021 2
se VK 2022 1
if (F) {
#
result_df = data |> 
mutate(v36_date_year = str_extract(v02_date, "\\d{4}")) |> 
select(v35_country, v36_date_year)
#
cat(simplermarkdown::md_table(result_df))
}
pd = data |> group_by(v04_article.type) |> summarize(mean=mean(v05_article.size)) |> left_join(dt04, by=join_by(v04_article.type==V2))

news sources

#
pd = data |> group_by(v03_news.source, v35_country) |> summarize(count=n()) |> 
left_join(dt03, by=join_by(v03_news.source==V2)) |> 
mutate(cn=paste(v35_country, V3), cc=as.factor(v35_country))
`summarise()` has grouped output by 'v03_news.source'. You can override using
the `.groups` argument.
p1 = ggplot(pd, aes(x=v35_country, y=count, fill=cn)) +
    geom_bar(stat="identity", position=position_dodge()) + 
    scale_fill_discrete(name=pd$V1[1]) + 
    labs(y="count", x="v35_country", title="v03_news.source by v35_country")

#    scale_fill_discrete(name=pd$V1[1], labels=V3)
#    scale_fill_manual(values=pd$cx)
#    scale_color_viridis(discrete=T) +
#p1 + scale_fill_brewer(colorRampPalette(brewer.pal(9,"YlOrRd"))(50))
#p1 + scale_fill_manual(values=colorRampPalette(brewer.pal(9,"Spectral"))(nrow(unique(pd[,1]))))
p1 + scale_fill_manual(values=viridis(nrow(unique(pd[,1]))))
Scale for fill is already present.
Adding another scale for fill, which will replace the existing scale.

frequency charts

# get graphs
for (i in c(4:9,12:15,18:21,22:32,34)) {
pd = desc_get(data, get(paste0("dt",sprintf("%02d", i))), i, var_pl=T)

# get table
print(pd[2])
#
write.table(pd[2], paste0("../tmp/haidi-table-v", sprintf("%02d", i), ".csv"), sep="\t", quot=T, row.names=F)
}
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.
[[1]]
# A tibble: 10 × 9
# Groups:   v35_country [3]
   v35_country v04_article.type count   prop prop_lab count_lab V1            V2
   <chr>                  <int> <int>  <dbl>    <dbl>     <dbl> <chr>      <int>
 1 dk                         1    26 0.531    0.735       36   Article t…     1
 2 dk                         2     7 0.143    0.398       19.5 Article t…     2
 3 dk                         3     7 0.143    0.255       12.5 Article t…     3
 4 dk                         4     9 0.184    0.0918       4.5 Article t…     4
 5 fi                         1    59 0.776    0.612       46.5 Article t…     1
 6 fi                         2    14 0.184    0.132       10   Article t…     2
 7 fi                         3     3 0.0395   0.0197       1.5 Article t…     3
 8 se                         1    14 0.233    0.883       53   Article t…     1
 9 se                         2    34 0.567    0.483       29   Article t…     2
10 se                         3    12 0.2      0.1          6   Article t…     3
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 9 × 9
# Groups:   v35_country [3]
  v35_country v05_article.size count   prop prop_lab count_lab V1       V2 V3   
  <chr>                  <int> <int>  <dbl>    <dbl>     <dbl> <chr> <int> <chr>
1 dk                         1     7 0.143    0.929       45.5 Arti…     1 Small
2 dk                         2    19 0.388    0.663       32.5 Arti…     2 Medi…
3 dk                         3    23 0.469    0.235       11.5 Arti…     3 Large
4 fi                         1    47 0.618    0.691       52.5 Arti…     1 Small
5 fi                         2    24 0.316    0.224       17   Arti…     2 Medi…
6 fi                         3     5 0.0658   0.0329       2.5 Arti…     3 Large
7 se                         1     9 0.15     0.925       55.5 Arti…     1 Small
8 se                         2    10 0.167    0.767       46   Arti…     2 Medi…
9 se                         3    41 0.683    0.342       20.5 Arti…     3 Large
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 9 × 9
# Groups:   v35_country [3]
  v35_country v06_illustrated count   prop prop_lab count_lab V1        V2 V3   
  <chr>                 <int> <int>  <dbl>    <dbl>     <dbl> <chr>  <int> <chr>
1 dk                        0    14 0.286    0.857       42   Illus…     0 No   
2 dk                        1    23 0.469    0.480       23.5 Illus…     1 Yes,…
3 dk                        2    12 0.245    0.122        6   Illus…     2 Yes …
4 fi                        0    49 0.645    0.678       51.5 Illus…     0 No   
5 fi                        1    25 0.329    0.191       14.5 Illus…     1 Yes,…
6 fi                        2     2 0.0263   0.0132       1   Illus…     2 Yes …
7 se                        0     3 0.0508   0.975       57.5 Illus…     0 No   
8 se                        1    52 0.881    0.508       30   Illus…     1 Yes,…
9 se                        2     4 0.0678   0.0339       2   Illus…     2 Yes …
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 11 × 9
# Groups:   v35_country [3]
   v35_country v07_type.of.illustration count   prop prop_lab count_lab V1      
   <chr>                          <int> <int>  <dbl>    <dbl>     <dbl> <chr>   
 1 dk                                 1    15 0.429   0.786        27.5 Type of…
 2 dk                                 2     3 0.0857  0.529        18.5 Type of…
 3 dk                                 3     9 0.257   0.357        12.5 Type of…
 4 dk                                 4     6 0.171   0.143         5   Type of…
 5 dk                                 5     2 0.0571  0.0286        1   Type of…
 6 fi                                 1    14 0.538   0.731        19   Type of…
 7 fi                                 2    12 0.462   0.231         6   Type of…
 8 se                                 1    38 0.679   0.661        37   Type of…
 9 se                                 2    15 0.268   0.187        10.5 Type of…
10 se                                 4     2 0.0357  0.0357        2   Type of…
11 se                                 5     1 0.0179  0.00893       0.5 Type of…
# ℹ 2 more variables: V2 <int>, V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v08_target.group.in.…¹ count   prop prop_lab count_lab V1       V2
  <chr>                        <int> <int>  <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                               0    27 0.931    0.534       15.5 Targ…     0
2 dk                               1     2 0.0690   0.0345       1   Targ…     1
3 fi                               0     4 0.267    0.867       13   Targ…     0
4 fi                               1    11 0.733    0.367        5.5 Targ…     1
5 se                               0    39 0.696    0.652       36.5 Targ…     0
6 se                               1    17 0.304    0.152        8.5 Targ…     1
# ℹ abbreviated name: ¹​v08_target.group.in.illustration
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v09_agency.of.target…¹ count   prop prop_lab count_lab V1       V2
  <chr>                        <int> <int>  <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                               0     1 0.5       0.75        1.5 Agen…     0
2 dk                               1     1 0.5       0.25        0.5 Agen…     1
3 fi                               0     1 0.0909    0.955      10.5 Agen…     0
4 fi                               1    10 0.909     0.455       5   Agen…     1
5 se                               0     7 0.412     0.794      13.5 Agen…     0
6 se                               1    10 0.588     0.294       5   Agen…     1
# ℹ abbreviated name: ¹​v09_agency.of.target.group.in.illustration
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v12_heading.content.…¹ count   prop prop_lab count_lab V1       V2
  <chr>                        <int> <int>  <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                               0    42 0.857    0.571       28   Head…     0
2 dk                               1     7 0.143    0.0714       3.5 Head…     1
3 fi                               0    67 0.882    0.559       42.5 Head…     0
4 fi                               1     9 0.118    0.0592       4.5 Head…     1
5 se                               0    56 0.933    0.533       32   Head…     0
6 se                               1     4 0.0667   0.0333       2   Head…     1
# ℹ abbreviated name: ¹​v12_heading.content.health
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v13_heading.content.…¹ count   prop prop_lab count_lab V1       V2
  <chr>                        <int> <int>  <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                               0    46 0.939    0.531       26   Head…     0
2 dk                               1     3 0.0612   0.0306       1.5 Head…     1
3 fi                               0    57 0.75     0.625       47.5 Head…     0
4 fi                               1    19 0.25     0.125        9.5 Head…     1
5 se                               0    44 0.733    0.633       38   Head…     0
6 se                               1    16 0.267    0.133        8   Head…     1
# ℹ abbreviated name: ¹​v13_heading.content.old
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v14_heading.content.d…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    34 0.694    0.653      32   Head…     0
2 dk                                1    15 0.306    0.153       7.5 Head…     1
3 fi                                0    48 0.632    0.684      52   Head…     0
4 fi                                1    28 0.368    0.184      14   Head…     1
5 se                                0    44 0.733    0.633      38   Head…     0
6 se                                1    16 0.267    0.133       8   Head…     1
# ℹ abbreviated name: ¹​v14_heading.content.digital.tekn
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 5 × 9
# Groups:   v35_country [3]
  v35_country v15_heading.content.…¹ count   prop prop_lab count_lab V1       V2
  <chr>                        <int> <int>  <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                               0    49 1       0.5          24.5 Head…     0
2 fi                               0    75 0.987   0.507        38.5 Head…     0
3 fi                               1     1 0.0132  0.00658       0.5 Head…     1
4 se                               0    55 0.917   0.542        32.5 Head…     0
5 se                               1     5 0.0833  0.0417        2.5 Head…     1
# ℹ abbreviated name: ¹​v15_heading.content.ill
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 28 × 9
# Groups:   v35_country [3]
   v35_country v18_article.content.domin…¹ count   prop prop_lab count_lab V1   
   <chr>                             <int> <int>  <dbl>    <dbl>     <dbl> <chr>
 1 dk                                    1     6 0.122     0.939      46   Arti…
 2 dk                                    2     1 0.0204    0.867      42.5 Arti…
 3 dk                                    3    14 0.286     0.714      35   Arti…
 4 dk                                    4     1 0.0204    0.561      27.5 Arti…
 5 dk                                    5     1 0.0204    0.541      26.5 Arti…
 6 dk                                    6     1 0.0204    0.520      25.5 Arti…
 7 dk                                    8     3 0.0612    0.480      23.5 Arti…
 8 dk                                    9     1 0.0204    0.439      21.5 Arti…
 9 dk                                   10     8 0.163     0.347      17   Arti…
10 dk                                   12    13 0.265     0.133       6.5 Arti…
# ℹ 18 more rows
# ℹ abbreviated name: ¹​v18_article.content.dominant.theme
# ℹ 2 more variables: V2 <int>, V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 20 × 9
# Groups:   v35_country [3]
   v35_country v19_subject.positioning count   prop prop_lab count_lab V1       
   <chr>                         <int> <int>  <dbl>    <dbl>     <dbl> <chr>    
 1 dk                                1    24 0.774    0.613       19   Subject …
 2 dk                                3     3 0.0968   0.177        5.5 Subject …
 3 dk                                6     1 0.0323   0.113        3.5 Subject …
 4 dk                                9     3 0.0968   0.0484       1.5 Subject …
 5 fi                                1    59 0.787    0.607       45.5 Subject …
 6 fi                                2     2 0.0267   0.2         15   Subject …
 7 fi                                3     2 0.0267   0.173       13   Subject …
 8 fi                                4     2 0.0267   0.147       11   Subject …
 9 fi                                8     2 0.0267   0.12         9   Subject …
10 fi                                9     5 0.0667   0.0733       5.5 Subject …
11 fi                               11     3 0.04     0.02         1.5 Subject …
12 se                                1    40 0.667    0.667       40   Subject …
13 se                                2     2 0.0333   0.317       19   Subject …
14 se                                3     5 0.0833   0.258       15.5 Subject …
15 se                                4     1 0.0167   0.208       12.5 Subject …
16 se                                5     4 0.0667   0.167       10   Subject …
17 se                                6     2 0.0333   0.117        7   Subject …
18 se                                7     2 0.0333   0.0833       5   Subject …
19 se                                8     2 0.0333   0.05         3   Subject …
20 se                                9     2 0.0333   0.0167       1   Subject …
# ℹ 2 more variables: V2 <int>, V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 5 × 9
# Groups:   v35_country [3]
  v35_country v20_agency.of.target.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    49 1       0.5         24.5 Agen…     0
2 fi                                0    62 0.816   0.592       45   Agen…     0
3 fi                                1    14 0.184   0.0921       7   Agen…     1
4 se                                0    53 0.883   0.558       33.5 Agen…     0
5 se                                1     7 0.117   0.0583       3.5 Agen…     1
# ℹ abbreviated name: ¹​v20_agency.of.target.group.in.article.voice
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v21_gender.visible.m…¹ count   prop prop_lab count_lab V1       V2
  <chr>                        <int> <int>  <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                               0    48 0.980    0.510       25   Gend…     0
2 dk                               1     1 0.0204   0.0102       0.5 Gend…     1
3 fi                               0    62 0.816    0.592       45   Gend…     0
4 fi                               1    14 0.184    0.0921       7   Gend…     1
5 se                               0    47 0.783    0.608       36.5 Gend…     0
6 se                               1    13 0.217    0.108        6.5 Gend…     1
# ℹ abbreviated name: ¹​v21_gender.visible.mentioned
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v22_actors.mentioned.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    24 0.490    0.755      37   Acto…     0
2 dk                                1    25 0.510    0.255      12.5 Acto…     1
3 fi                                0    47 0.618    0.691      52.5 Acto…     0
4 fi                                1    29 0.382    0.191      14.5 Acto…     1
5 se                                0    44 0.733    0.633      38   Acto…     0
6 se                                1    16 0.267    0.133       8   Acto…     1
# ℹ abbreviated name: ¹​v22_actors.mentioned.given.voice.state.gov
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v23_actors.mentioned.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    37 0.755    0.622      30.5 Acto…     0
2 dk                                1    12 0.245    0.122       6   Acto…     1
3 fi                                0    42 0.553    0.724      55   Acto…     0
4 fi                                1    34 0.447    0.224      17   Acto…     1
5 se                                0    40 0.667    0.667      40   Acto…     0
6 se                                1    20 0.333    0.167      10   Acto…     1
# ℹ abbreviated name: ¹​v23_actors.mentioned.given.voice.region
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v24_actors.mentioned.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    24 0.490    0.755      37   Acto…     0
2 dk                                1    25 0.510    0.255      12.5 Acto…     1
3 fi                                0    58 0.763    0.618      47   Acto…     0
4 fi                                1    18 0.237    0.118       9   Acto…     1
5 se                                0    43 0.717    0.642      38.5 Acto…     0
6 se                                1    17 0.283    0.142       8.5 Acto…     1
# ℹ abbreviated name: ¹​v24_actors.mentioned.given.voice.municipality
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v25_actors.mentioned.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    29 0.592    0.704      34.5 Acto…     0
2 dk                                1    20 0.408    0.204      10   Acto…     1
3 fi                                0    47 0.618    0.691      52.5 Acto…     0
4 fi                                1    29 0.382    0.191      14.5 Acto…     1
5 se                                0    45 0.75     0.625      37.5 Acto…     0
6 se                                1    15 0.25     0.125       7.5 Acto…     1
# ℹ abbreviated name: ¹​v25_actors.mentioned.given.voice.agency
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v26_actors.mentioned.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    26 0.531   0.735       36   Acto…     0
2 dk                                1    23 0.469   0.235       11.5 Acto…     1
3 fi                                0    62 0.816   0.592       45   Acto…     0
4 fi                                1    14 0.184   0.0921       7   Acto…     1
5 se                                0    45 0.75    0.625       37.5 Acto…     0
6 se                                1    15 0.25    0.125        7.5 Acto…     1
# ℹ abbreviated name: ¹​v26_actors.mentioned.given.voice.politician.party
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v27_actors.mentioned.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    23 0.469    0.765      37.5 Acto…     0
2 dk                                1    26 0.531    0.265      13   Acto…     1
3 fi                                0    43 0.566    0.717      54.5 Acto…     0
4 fi                                1    33 0.434    0.217      16.5 Acto…     1
5 se                                0    43 0.717    0.642      38.5 Acto…     0
6 se                                1    17 0.283    0.142       8.5 Acto…     1
# ℹ abbreviated name:
#   ¹​v27_actors.mentioned.given.voice.physician.nurse.health.staff
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v28_actors.mentioned.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    27 0.551    0.724      35.5 Acto…     0
2 dk                                1    22 0.449    0.224      11   Acto…     1
3 fi                                0    35 0.461    0.770      58.5 Acto…     0
4 fi                                1    41 0.539    0.270      20.5 Acto…     1
5 se                                0    36 0.6      0.7        42   Acto…     0
6 se                                1    24 0.4      0.2        12   Acto…     1
# ℹ abbreviated name:
#   ¹​v28_actors.mentioned.given.voice.organ.health.care.service
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v29_actors.mentioned.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    29 0.592    0.704      34.5 Acto…     0
2 dk                                1    20 0.408    0.204      10   Acto…     1
3 fi                                0    51 0.671    0.664      50.5 Acto…     0
4 fi                                1    25 0.329    0.164      12.5 Acto…     1
5 se                                0    43 0.717    0.642      38.5 Acto…     0
6 se                                1    17 0.283    0.142       8.5 Acto…     1
# ℹ abbreviated name: ¹​v29_actors.mentioned.given.voice.scientist
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v30_actors.mentioned.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    39 0.796    0.602      29.5 Acto…     0
2 dk                                1    10 0.204    0.102       5   Acto…     1
3 fi                                0    59 0.776    0.612      46.5 Acto…     0
4 fi                                1    17 0.224    0.112       8.5 Acto…     1
5 se                                0    40 0.667    0.667      40   Acto…     0
6 se                                1    20 0.333    0.167      10   Acto…     1
# ℹ abbreviated name: ¹​v30_actors.mentioned.given.voice.ngo
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v31_actors.mentioned.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    31 0.633    0.684      33.5 Acto…     0
2 dk                                1    18 0.367    0.184       9   Acto…     1
3 fi                                0    58 0.763    0.618      47   Acto…     0
4 fi                                1    18 0.237    0.118       9   Acto…     1
5 se                                0    44 0.733    0.633      38   Acto…     0
6 se                                1    16 0.267    0.133       8   Acto…     1
# ℹ abbreviated name: ¹​v31_actors.mentioned.given.voice.industry
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v32_actors.mentioned.…¹ count  prop prop_lab count_lab V1       V2
  <chr>                         <int> <int> <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                                0    13 0.265    0.867      42.5 Acto…     0
2 dk                                1    36 0.735    0.367      18   Acto…     1
3 fi                                0    38 0.5      0.75       57   Acto…     0
4 fi                                1    38 0.5      0.25       19   Acto…     1
5 se                                0    39 0.65     0.675      40.5 Acto…     0
6 se                                1    21 0.35     0.175      10.5 Acto…     1
# ℹ abbreviated name:
#   ¹​v32_actors.mentioned.given.voice.other.citizen.family.relatives
# ℹ 1 more variable: V3 <chr>
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 8 × 9
# Groups:   v35_country [3]
  v35_country v34_tonality.of.arti…¹ count   prop prop_lab count_lab V1       V2
  <chr>                        <int> <int>  <dbl>    <dbl>     <dbl> <chr> <int>
1 dk                               1    22 0.449    0.776       38   Tona…     1
2 dk                               2    24 0.490    0.306       15   Tona…     2
3 dk                               3     3 0.0612   0.0306       1.5 Tona…     3
4 fi                               1    26 0.342    0.829       63   Tona…     1
5 fi                               2    33 0.434    0.441       33.5 Tona…     2
6 fi                               3    17 0.224    0.112        8.5 Tona…     3
7 se                               1    20 0.333    0.833       50   Tona…     1
8 se                               2    40 0.667    0.333       20   Tona…     2
# ℹ abbreviated name: ¹​v34_tonality.of.article
# ℹ 1 more variable: V3 <chr>
cat ../tmp/haidi-table-* > ../tmp/haidi-tables.csv

index variable

#
pd = data |> select(35,33) |> rename(v35=1, v33=2)
p1 = ggplot(pd, aes(x=v35, y=v33, fill=v35)) +
    geom_boxplot(alpha=1.0) +
    stat_summary(fun=mean) +
    labs(y="v33_power.sum.index", x="v35_country", title="v33_power.sum.index by v35_country") + 
    theme(legend.position="none")

#
p1
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_segment()`).

#
#dt03 |> as_tibble()

# load dataset, rename cols, country
fn = "../csv/haidi-data-231012.csv"
#fn = "../csv/haidi-data-231107.csv"
#data = read.table(fn, sep='\t', quote="", header=F, strip.white=TRUE, stringsAsFactors=FALSE) |> as_tibble() |> rename_with(~ cn, all_of(paste0(rep("V",34), seq(1,34))))
data = read.table(fn, sep='\t', header=T, strip.white=T, stringsAsFactors=F) |> as_tibble()

# add country variable
#data = data |> 
#mutate(v35_country = as.numeric(str_extract(data$v03_news.source, "^."))) |> 
#mutate(v35_country = ifelse(v35_country==1,"se", ifelse(v35_country==2,"dk", ifelse(v35_country==3,"fi",NA))))

# clean data
data[,18:21] = data[,18:21] |> map(~str_extract(., "^\\d+") |> as.numeric()) |> as_tibble()
data = data |> rename(v33_power.sum.index=33)

# 
#write.table(data, fn, sep="\t", quot=T, row.names=F)
  • combined dataset link
  • combined figures link

231025: color theme

library(RColorBrewer)
#display.brewer.all()

# custom theme
some_graph <- theme(panel.grid.major=element_line(linewidth=2))
some_color <- c("deeppink", "chartreuse", "midnightblue")
# put the elements in a list
theme_haidi <- list(some_graph, scale_color_manual(values=some_color))
theme_haidi <- list(some_graph, scale_colour_brewer(palette="Blues"))

231019: descriptives

pd = data |> group_by(v03_news.source, v35_country) |> summarize(count=n()) |> 
left_join(dt03, by=join_by(v03_news.source==V2)) |> 
mutate(cn=paste(v35_country, V3), cc=as.factor(v35_country))
`summarise()` has grouped output by 'v03_news.source'. You can override using
the `.groups` argument.
p1 = ggplot(pd, aes(x=v35_country, y=count, fill=cn)) +
    geom_bar(stat="identity", position=position_dodge()) + 
    scale_fill_discrete(name=pd$V1[1]) + 
    labs(y="count", x="v35_country", title="v03_news.source by v35_country")

#    scale_fill_discrete(name=pd$V1[1], labels=V3)
#    scale_fill_manual(values=pd$cx)
#    scale_color_viridis(discrete=T) +
#p1 + scale_fill_brewer(colorRampPalette(brewer.pal(9,"YlOrRd"))(50))
#p1 + scale_fill_manual(values=colorRampPalette(brewer.pal(9,"Spectral"))(nrow(unique(pd[,1]))))
p1 + scale_fill_manual(values=viridis(nrow(unique(pd[,1]))))
Scale for fill is already present.
Adding another scale for fill, which will replace the existing scale.

pd = data |> select(35,33) |> rename(v35=1, v33=2)
p1 = ggplot(pd, aes(x=v35, y=v33, fill=v35)) +
    geom_boxplot(alpha=1.0) +
    stat_summary(fun=mean) +
    labs(y="v33_power.sum.index", x="v35_country", title="v33_power.sum.index by v35_country") + 
    theme(legend.position="none")

#
p1
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_segment()`).

for (i in 33:34) {
pl = desc_get(data, get(paste0("dt",i)), i)
}
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

231018: descriptives

for (i in 22:32) {
pl = desc_get(data, get(paste0("dt",i)), i)
}
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

for (i in 18:21) {
pl = desc_get(data, get(paste0("dt",i)), i)
}
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

231017: descriptives

for (i in 12:15) {
pl = desc_get(data, get(paste0("dt",i)), i)
}
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

231016: descriptives

for (i in 6:9) {
pl = desc_get(data, get(paste0("dt0",i)), i)
}
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

pl = desc_get(data, dt04, 4)
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

#grid.arrange(pl[1], pl[2], ncol=2, widths=c(3, 4))

231012: descriptives

# plot data, grouped
pdg = data |> 
group_by(v35_country, v05_article.size) |> 
summarize(count=n()) |> 
mutate(prop=count/sum(count)) |> 
mutate(prop_lab=rev(cumsum(rev(prop)))-prop/2) |> 
mutate(count_lab=rev(cumsum(rev(count)))-count/2) |> 
left_join(dt05 |> as_tibble() |> arrange(V2), by=join_by(v05_article.size==V2), keep=T) 
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.
#
p1 = ggplot(data=pdg, aes(fill=as.factor(V2), y=count, x=v35_country)) + 
    geom_bar(position="fill", stat="identity") +
    scale_y_continuous(labels=scales::percent) +
#    geom_text(aes(label=paste0(prop*100,"%")), position=position_stack(vjust=0.5), size=2)
    geom_text(aes(label=paste0(round(100*prop,1),"%"), y=prop_lab), size=3) +
    scale_fill_discrete(name=dt05$V1[1], labels=dt05$V3)

#
p2 = ggplot(data=pdg, aes(fill=as.factor(V2), y=count, x=v35_country)) + 
    geom_bar(position="stack", stat="identity") +
    geom_text(aes(label=count, y=count_lab), size=3) +
    scale_fill_discrete(name=dt05$V1[1], labels=dt05$V3)

#
p3 = ggplot(data=pdg, aes(fill=as.factor(V2), y=count, x=v35_country)) + 
    geom_bar(position="stack", stat="identity") +
#    geom_text(aes(label=count, y=count_lab), size=3) +
    geom_text(aes(label=paste0(round(100*prop,1),"%"), y=count_lab), size=3) +
#    scale_fill_discrete(name=dt05$V1[1], labels=dt05$V3)
    scale_fill_discrete(guide="none")

# plot data, combined
pdc = data |> 
group_by(v05_article.size) |> 
summarize(count=n()) |> 
mutate(prop=count/sum(count)) |> 
mutate(prop_lab=rev(cumsum(rev(prop)))-prop/2) |> 
mutate(count_lab=rev(cumsum(rev(count)))-count/2) |> 
left_join(dt05 |> as_tibble(), by=join_by(v05_article.size==V2), keep=T) 

#
p4 = ggplot(data=pdc, aes(fill=as.factor(V2), y=count, x="combined")) + 
    geom_bar(position="fill", stat="identity") +
    scale_y_continuous(labels=scales::percent) +
    geom_text(aes(label=paste0(round(100*prop,1),"%"), y=prop_lab), size=3) +
    scale_fill_discrete(name=dt05$V1[1], labels=dt05$V3) +
    labs(y="percentage", x="v35_country")

#par(mfrow = c(1,2))
#pdf("../fig/foo.pdf")#png()
grid.arrange(p4, p3, ncol=2, widths=c(3, 4))

#dev.off()
  • combined dataset link

231009: dk icr re-test

# load dataset
fn = "../csv/haidi-wp1-coding-dk-2.tsv"
#data = read.table(fn, sep='\t', header=F, strip.white=TRUE, stringsAsFactors=FALSE, quote="")
data = read.table(fn, sep='\t', header=F, strip.white=TRUE, stringsAsFactors=FALSE) |> as_tibble()

# select data
data = data |> 
na.omit() |> 
select(-c(2)) |> 
select(last_col(), 1:29) |> 
rename(coder_id=V31, content_id=V1)
# content_id for both coders
data$some = data$content_id
# clean data
#data$V3 = data$V3 |> str_extract("\\d+") |> as.numeric()

analyze data

# 
write.table(data, "../csv/haidi-dk-2.tsv", sep="\t", quot=T, row.names=F)
# transform data to wide (10 content units * 28 content vars)
data = data |> 
pivot_wider(id_cols=coder_id, names_from=some, values_from=c(3:30)) |> 
select(-coder_id)
# 
data
# A tibble: 2 × 280
  V3_DK001 V3_DK002 V3_DK003 V3_DK004 V3_DK005 V3_DK006 V3_DK007 V3_DK008
     <int>    <int>    <int>    <int>    <int>    <int>    <int>    <int>
1      208      206      208      206      206      206      206      206
2      208      206      208      206      206      206      206      206
# ℹ 272 more variables: V3_DK009 <int>, V3_DK010 <int>, V4_DK001 <int>,
#   V4_DK002 <int>, V4_DK003 <int>, V4_DK004 <int>, V4_DK005 <int>,
#   V4_DK006 <int>, V4_DK007 <int>, V4_DK008 <int>, V4_DK009 <int>,
#   V4_DK010 <int>, V5_DK001 <int>, V5_DK002 <int>, V5_DK003 <int>,
#   V5_DK004 <int>, V5_DK005 <int>, V5_DK006 <int>, V5_DK007 <int>,
#   V5_DK008 <int>, V5_DK009 <int>, V5_DK010 <int>, V6_DK001 <int>,
#   V6_DK002 <int>, V6_DK003 <int>, V6_DK004 <int>, V6_DK005 <int>, …
# inter rater reliability
data = as.matrix(data)
# specify data type: nominal, ordinal, interval, ratio 
kripp.alpha(data, method="nominal")
 Krippendorff's alpha

 Subjects = 280 
   Raters = 2 
    alpha = 0.704 
# write data
fn = "../csv/haidi-wp1-coding-dk-2-wide.csv"
write.table(data, fn, sep="\t", quot=T, row.names=F)
  • dataset wide format link

231004: all

# bash code chunk
head -n1 ../csv/haidi-all.tsv | tr '\t' '\n' | cat -n | head
     1  "coder_id"
     2  "content_id"
     3  "V3"
     4  "V4"
     5  "V5"
     6  "V6"
     7  "V7"
     8  "V8"
     9  "V9"
    10  "V12"
# load datasets
fn = "../csv/haidi-all.tsv"
data = read.table(fn, sep='\t', header=T, strip.white=TRUE, stringsAsFactors=FALSE) |> as_tibble()

data |> group_by(content_id, coder_id) |> summarize(count=n())
`summarise()` has grouped output by 'content_id'. You can override using the
`.groups` argument.
# A tibble: 60 × 3
# Groups:   content_id [40]
   content_id coder_id count
   <chr>      <chr>    <int>
 1 DK001      A            1
 2 DK001      B            1
 3 DK002      A            1
 4 DK002      B            1
 5 DK003      A            1
 6 DK003      B            1
 7 DK004      A            1
 8 DK004      B            1
 9 DK005      A            1
10 DK005      B            1
# ℹ 50 more rows
data |> group_by(V4) |> summarize(count=n())
# A tibble: 4 × 2
     V4 count
  <int> <int>
1     1    44
2     2    10
3     3     3
4     4     3
data |> group_by(V4) |> summarize(count=n()) |>
ggplot(aes(x=V4, y=count)) +
#  geom_bar(fill="green", stat="identity") + 
  geom_bar(stat="identity") + 
  theme_minimal()

data |> group_by(V4) |> summarize(count=n()) |> arrange(desc(V4)) |> mutate(prop=round(count*100/sum(count), 1), lab.ypos=cumsum(prop) - 0.5*prop)
# A tibble: 4 × 4
     V4 count  prop lab.ypos
  <int> <int> <dbl>    <dbl>
1     4     3   5        2.5
2     3     3   5        7.5
3     2    10  16.7     18.4
4     1    44  73.3     63.4
data |> group_by(V4) |> summarize(count=n()) |> arrange(desc(V4)) |> mutate(prop=round(count*100/sum(count), 1), lab.ypos=cumsum(prop) - 0.5*prop) |> 
ggplot(aes(x="", y=prop, fill=V4)) +
  geom_bar(width=1, stat="identity", color="white") +
  geom_text(aes(y=lab.ypos, label=prop), color="white") +
  coord_polar("y", start=0) +
  theme_minimal()

231003: all datasets

# load datasets
fn = "../csv/haidi-dk.tsv"
data_dk = read.table(fn, sep='\t', header=T, strip.white=TRUE, stringsAsFactors=FALSE)
fn = "../csv/haidi-fi.tsv"
data_fi = read.table(fn, sep='\t', header=T, strip.white=TRUE, stringsAsFactors=FALSE)
fn = "../csv/haidi-se.tsv"
data_se = read.table(fn, sep='\t', header=T, strip.white=TRUE, stringsAsFactors=FALSE)

data = rbind(data_dk, data_fi, data_se)
data |> as_tibble()
# A tibble: 60 × 31
   coder_id content_id    V3    V4    V5    V6    V7    V8    V9   V12   V13
   <chr>    <chr>      <int> <int> <int> <int> <int> <int> <int> <int> <int>
 1 A        DK001        208     4     3     2     1     0    99     0     0
 2 A        DK002        206     4     3     2     4     0    99     0     0
 3 A        DK003        208     1     2     0    99     0    99     0     0
 4 A        DK004        206     1     3     1     3     0    99     0     0
 5 A        DK005        206     1     2     0    99     0    99     0     0
 6 A        DK006        206     1     3     1     3     0    99     0     0
 7 A        DK007        206     1     3     0    99     0    99     0     0
 8 A        DK008        206     1     2     1     3     0    99     0     0
 9 A        DK009        206     1     2     1     3     0    99     1     0
10 A        DK010        206     1     1     0    99    99    99     0     0
# ℹ 50 more rows
# ℹ 20 more variables: V14 <int>, V15 <int>, V18 <int>, V19 <int>, V20 <int>,
#   V21 <int>, V22 <int>, V23 <int>, V24 <int>, V25 <int>, V26 <int>,
#   V27 <int>, V28 <int>, V29 <int>, V30 <int>, V31 <int>, V32 <int>,
#   V33 <int>, V34 <int>, some <chr>
# 
write.table(data, "../csv/haidi-all.tsv", sep="\t", quot=T, row.names=F)

230525: finnish dataset

# load dataset
fn = "../csv/haidi-wp1-coding-fi.tsv"
#data = read.table(fn, sep='\t', header=F, strip.white=TRUE, stringsAsFactors=FALSE)
data = read.table(fn, sep='\t', header=F, quote="", strip.white=TRUE, stringsAsFactors=FALSE)
# select data
data = data |> 
as_tibble() |> 
na.omit() |> 
select(-c(2,10,11,16,17)) |> 
select(last_col(), 1:29) |> 
rename(coder_id=V35, content_id=V1)
# content_id for both coders
data$some = data$content_id
# clean data
data$V3 = data$V3 |> str_extract("\\d+") |> as.numeric()

analyze data

# 
write.table(data, "../csv/haidi-fi.tsv", sep="\t", quot=T, row.names=F)
# transform data to wide (10 content units * 28 content vars)
data = data |> 
pivot_wider(id_cols=coder_id, names_from=some, values_from=c(3:30)) |> 
select(-coder_id)
# 
data
# A tibble: 2 × 280
  V3_FI001 V3_FI002 V3_FI003 V3_FI004 V3_FI005 V3_FI006 V3_FI007 V3_FI008
     <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
1      315      311      311      313      314      315      315      314
2      315      311      311      313      314      315      315      314
# ℹ 272 more variables: V3_FI009 <dbl>, V3_FI010 <dbl>, V4_FI001 <int>,
#   V4_FI002 <int>, V4_FI003 <int>, V4_FI004 <int>, V4_FI005 <int>,
#   V4_FI006 <int>, V4_FI007 <int>, V4_FI008 <int>, V4_FI009 <int>,
#   V4_FI010 <int>, V5_FI001 <int>, V5_FI002 <int>, V5_FI003 <int>,
#   V5_FI004 <int>, V5_FI005 <int>, V5_FI006 <int>, V5_FI007 <int>,
#   V5_FI008 <int>, V5_FI009 <int>, V5_FI010 <int>, V6_FI001 <int>,
#   V6_FI002 <int>, V6_FI003 <int>, V6_FI004 <int>, V6_FI005 <int>, …
# inter rater reliability
data = as.matrix(data)
# specify data type: nominal, ordinal, interval, ratio 
kripp.alpha(data, method="nominal")
 Krippendorff's alpha

 Subjects = 280 
   Raters = 2 
    alpha = 0.72 
# write data
fn = "../csv/haidi-wp1-coding-fi-wide.csv"
write.table(data, fn, sep="\t", quot=T, row.names=F)
  • dataset wide format link

230523: danish dataset

# load dataset
fn = "../csv/haidi-wp1-coding-dk.tsv"
data = read.table(fn, sep='\t', header=F, strip.white=TRUE, stringsAsFactors=FALSE)
data = read.table(fn, sep='\t', header=F, quote="", strip.white=TRUE, stringsAsFactors=FALSE)
# select data
data = data |> 
as_tibble() |> 
na.omit() |> 
select(-c(2,10,11,16,17)) |> 
select(last_col(), 1:29) |> 
rename(coder_id=V35, content_id=V1)
# content_id for both coders
data$some = data$content_id
# clean data
data$V18 = data$V18 |> str_extract("^\\d+") |> as.numeric()

analyze data

# 
write.table(data, "../csv/haidi-dk.tsv", sep="\t", quot=T, row.names=F)
# transform data to wide (10 content units * 28 content vars)
data = data |> 
pivot_wider(id_cols=coder_id, names_from=some, values_from=c(3:30)) |> 
select(-coder_id)
# 
data
# A tibble: 2 × 280
  V3_DK001 V3_DK002 V3_DK003 V3_DK004 V3_DK005 V3_DK006 V3_DK007 V3_DK008
     <int>    <int>    <int>    <int>    <int>    <int>    <int>    <int>
1      208      206      208      206      206      206      206      206
2      208      206      208      206      206      206      206      206
# ℹ 272 more variables: V3_DK009 <int>, V3_DK010 <int>, V4_DK001 <int>,
#   V4_DK002 <int>, V4_DK003 <int>, V4_DK004 <int>, V4_DK005 <int>,
#   V4_DK006 <int>, V4_DK007 <int>, V4_DK008 <int>, V4_DK009 <int>,
#   V4_DK010 <int>, V5_DK001 <int>, V5_DK002 <int>, V5_DK003 <int>,
#   V5_DK004 <int>, V5_DK005 <int>, V5_DK006 <int>, V5_DK007 <int>,
#   V5_DK008 <int>, V5_DK009 <int>, V5_DK010 <int>, V6_DK001 <int>,
#   V6_DK002 <int>, V6_DK003 <int>, V6_DK004 <int>, V6_DK005 <int>, …
# inter rater reliability
data = as.matrix(data)
# specify data type: nominal, ordinal, interval, ratio 
kripp.alpha(data, method="nominal")
 Krippendorff's alpha

 Subjects = 280 
   Raters = 2 
    alpha = 0.639 
# write data
fn = "../csv/haidi-wp1-coding-dk-wide.csv"
write.table(data, fn, sep="\t", quot=T, row.names=F)
  • dataset wide format link

230515: swedish dataset

# load dataset
fn = "../csv/some.tsv"
data = read.table(fn, sep='\t', header=F, strip.white=TRUE, stringsAsFactors=FALSE)
data = read.table(fn, sep='\t', header=F, quote="", strip.white=TRUE, stringsAsFactors=FALSE)
# select data
data = data |> as_tibble() |> select(-c(2,10,11,16,17)) |> select(last_col(), 1:29) |> rename(coder_id=V35, content_id=V1)
# duplicate content_id for both coders
data$some = rep(data$content_id[data$coder_id=="A"], 2)

230516: analyze data

# 
write.table(data, "../csv/haidi-se.tsv", sep="\t", quot=T, row.names=F)
# transform data to wide (10 content units * 28 content vars)
data = data |> 
pivot_wider(id_cols=coder_id, names_from=some, values_from=c(3:30)) |> 
select(-coder_id)
# 
data
# A tibble: 2 × 280
  V3_S001 V3_S002 V3_S003 V3_S004 V3_S005 V3_S006 V3_S007 V3_S008 V3_S009
    <int>   <int>   <int>   <int>   <int>   <int>   <int>   <int>   <int>
1     102     103     102     101     104     102     103     101     101
2     102     103     102     101     104     102     103     101     101
# ℹ 271 more variables: V3_S010 <int>, V4_S001 <int>, V4_S002 <int>,
#   V4_S003 <int>, V4_S004 <int>, V4_S005 <int>, V4_S006 <int>, V4_S007 <int>,
#   V4_S008 <int>, V4_S009 <int>, V4_S010 <int>, V5_S001 <int>, V5_S002 <int>,
#   V5_S003 <int>, V5_S004 <int>, V5_S005 <int>, V5_S006 <int>, V5_S007 <int>,
#   V5_S008 <int>, V5_S009 <int>, V5_S010 <int>, V6_S001 <int>, V6_S002 <int>,
#   V6_S003 <int>, V6_S004 <int>, V6_S005 <int>, V6_S006 <int>, V6_S007 <int>,
#   V6_S008 <int>, V6_S009 <int>, V6_S010 <int>, V7_S001 <int>, …
# inter rater reliability
data = as.matrix(data)
# specify data type: nominal, ordinal, interval, ratio 
kripp.alpha(data, method="nominal")
 Krippendorff's alpha

 Subjects = 280 
   Raters = 2 
    alpha = 0.784 
# write data
fn = "../csv/test.csv"
write.table(data, fn, sep="\t", quot=T, row.names=F)
  • dataset wide format link

230426: sample dataset

# load dataset
data = readxl::read_excel('../csv/some.xlsx', sheet="Blad1", col_names=paste0("x", seq(34)))
# select data
data = data |> 
mutate(coder_id=c(rep("a",5), rep("b",5)), content_id=rep(seq(5), 2)) |> 
select("coder_id","content_id",1,3,9,10) |> print(n=100)
# A tibble: 10 × 6
   coder_id content_id x1       x3    x9 x10                                    
   <chr>         <int> <chr> <dbl> <dbl> <chr>                                  
 1 a                 1 S001    102    99 Dolda larmsiffrorna: Så dåligt mår 85-…
 2 a                 2 S002    103    99 Satsningar som räddar liv              
 3 a                 3 S003    102    99 Detta måste ni rätta till i vården, po…
 4 a                 4 S004    101    99 Sju utmaningar - därför är det kris i …
 5 a                 5 S005    104    99 De kommande årens satsningar sker i pr…
 6 b                 1 S006    102     0 De har full koll på senioren           
 7 b                 2 S007    103    99 Mossig kritik mot vårdappar            
 8 b                 3 S008    101    99 Folksjukdomar som kan förvärras i spår…
 9 b                 4 S009    101    99 Så vill regeringen möta utmaningarna i…
10 b                 5 S010    105    99 Tekniken ska avlasta personalen        
# transform data
data = data |> 
pivot_wider(id_cols=coder_id, names_from=content_id, values_from=x3) |> 
select(-coder_id)
# https://rpubs.com/jacoblong/content-analysis-krippendorff-alpha-R
data
# A tibble: 2 × 5
    `1`   `2`   `3`   `4`   `5`
  <dbl> <dbl> <dbl> <dbl> <dbl>
1   102   103   102   101   104
2   102   103   101   101   105

230427: analyze data

# inter rater reliability
data = as.matrix(data)
# specify data type: nominal, ordinal, interval, ratio 
kripp.alpha(data, method="nominal")
 Krippendorff's alpha

 Subjects = 5 
   Raters = 2 
    alpha = 0.526 
  • Krippendorff’s Alpha values range from -1 to 1, with 1 representing unanimous agreement between the raters, 0 indicating they’re guessing randomly, and negative values suggesting the raters are systematically disagreeing. As suggested by Krippendorff, alphas above 0.8 are considered very good agreement, and tentative conclusions can be made with data where α≥0.667

sample data

# get some data
data <-
  tribble(
    ~content_id, ~coder_id, ~var1, ~var2,   ~var3,
    1,           "A",       1,     "Red",   FALSE,
    2,           "A",       3,     "Blue",  TRUE,
    3,           "A",       5,     "Blue",  TRUE,
    4,           "A",       7,     "Green", TRUE,
    5,           "A",       1,     "Red",   FALSE,
    1,           "B",       1,     "Red",   FALSE,
    2,           "B",       3,     "Blue",  FALSE,
    3,           "B",       3,     "Green", FALSE,
    4,           "B",       7,     "Green", TRUE,
    5,           "B",       3,     "Red",   FALSE,
  )

data |> print(n=100)
# A tibble: 10 × 5
   content_id coder_id  var1 var2  var3 
        <dbl> <chr>    <dbl> <chr> <lgl>
 1          1 A            1 Red   FALSE
 2          2 A            3 Blue  TRUE 
 3          3 A            5 Blue  TRUE 
 4          4 A            7 Green TRUE 
 5          5 A            1 Red   FALSE
 6          1 B            1 Red   FALSE
 7          2 B            3 Blue  FALSE
 8          3 B            3 Green FALSE
 9          4 B            7 Green TRUE 
10          5 B            3 Red   FALSE

exclude

if (T) {
knitr::knit_exit()
}